Tumor evolution project

Data used

In this notebook, we are using the tmb_genomic.tsv file generated from the 01-preprocess-data.Rmd script.

Set up

suppressPackageStartupMessages({
  library(tidyverse)
})

Directories and File Inputs/Outputs

# Detect the ".git" folder. This will be in the project root directory.
# Use this as the root directory to ensure proper sourcing of functions
# no matter where this is called from.
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
scratch_dir <- file.path(root_dir, "scratch")
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal") 
input_dir <- file.path(analysis_dir, "input")

# Input files
tmb_genomic_file <- file.path(scratch_dir, "tmb_genomic.tsv")
tumor_descriptor_color_palette_file <- file.path(root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv")

# File path to plots directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(analysis_dir, "/util/function-create-barplot-v1.R"))
source(paste0(root_dir, "/figures/scripts/theme.R"))

Read in data and process

# Read and process tmb_genomic file
tmb_genomic_all <- readr::read_tsv(tmb_genomic_file, guess_max = 100000, show_col_types = FALSE) 

# Are there any samples with both WGS and WXS? 
tmb_genomic_all %>% 
  unique() %>% 
  arrange(Kids_First_Participant_ID, experimental_strategy)  %>%
  group_by(Kids_First_Participant_ID) %>%
  summarise(experimental_strategy_sum = str_c(experimental_strategy, collapse = ";")) 

# Yes, they are, so let's remove these from downstream analyses.
tmb_genomic <- tmb_genomic_all %>% 
  filter(!experimental_strategy == "WXS")

# Read color palette
tumor_descriptor_color_palette <- readr::read_tsv(tumor_descriptor_color_palette_file, guess_max = 100000, show_col_types = FALSE)

TMB per Kids_First_Participant_ID

We will explore TMB per Kids_First_Participant_ID over time by creating stacked barplots.

# Define parameters for function
ylim <- 360
tmb_df <- tmb_genomic

# Run function
fname <- paste0(plots_dir, "/", "TMB-genomic.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic.pdf"
p <- create_stacked_barplot(tmb_df = tmb_df, ylim = ylim)
pdf(file = fname, width = 15, height = 6)
print(p)
dev.off()
quartz_off_screen 
                2 

Note that low TMB is defined as ≤5 mutations/Mb, intermediate TMB as >5 and ≤20 mutations/Mb, high TMB as >20 and ≤50 mutations/Mb, and very high TMB as >50 mutations/Mb.

We notice that there are samples with high TMB (hyper-mutant samples). Next, we will exclude these samples (TMB >= 20 mutations/Mb) from downstream analysis. Attention is needed in cases with a high number of mutations at only one timepoint, as excluding that sample leaves un-matched longitudinal samples. We will also remove those so that we always have matched longitudinal samples.

# Filter df
tmb_genomic_filter <- tmb_genomic %>%
  filter(!tmb >= 20)  %>%
  unique() %>% 
  arrange(Kids_First_Participant_ID, tumor_descriptor) %>%
  group_by(Kids_First_Participant_ID) %>%
  summarise(tumor_descriptor_sum = str_c(tumor_descriptor, collapse = ";")) %>% 
  filter(!tumor_descriptor_sum %in% c("Diagnosis", "Progressive", "Recurrence")) %>% 
  left_join(tmb_genomic, by = c("Kids_First_Participant_ID", "tumor_descriptor_sum")) %>% 
  mutate(cancer_group_sum = ifelse(short_histology == "HGAT", "High-grade glioma",
                                   ifelse(short_histology == "LGAT", "Low-grade glioma", "Other cancer group")),
         cancer_group_sum = replace_na(cancer_group_sum, "Other"),
         patient_id = paste(short_histology, Kids_First_Participant_ID, sep = "_"),
         patient_bs_id = paste(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, sep = "_")) %>% 
  drop_na(tmb)

# Define parameters for function
ylim <- 12.5
tmb_df <- tmb_genomic_filter

# Run function
fname <- paste0(plots_dir, "/", "TMB-genomic-no-hypermutants.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-no-hypermutants.pdf"
p <- create_stacked_barplot(tmb_df = tmb_df, ylim = ylim)
pdf(file = fname, width = 15, height = 8)
print(p)
dev.off()
quartz_off_screen 
                2 

TMB per patient_id and cancer_group_sum

We will explore TMB per cancer group over time by creating stacked barplots. We will plot based on cancer groups presenting with the highest number of samples (High- and Low-grade gliomas) versus any other cancer groups.

cancer_groups <- unique(as.character(tmb_genomic_filter$cancer_group_sum))
print(cancer_groups)
[1] "Other cancer group" "Low-grade glioma"   "High-grade glioma" 
for (i in seq_along(cancer_groups)) {
  print(i)
  tmb_genomic_filter_sub <- tmb_genomic_filter %>%
    filter(cancer_group_sum == cancer_groups [i])
  
  if(i == 1) {
    print (cancer_groups [i])
    # Define parameters for function
    ylim <- 8

  } else if (i == 2) {
    print (cancer_groups [i])
    # Define parameters for function
    ylim <- 4.5
    } else {
    print (cancer_groups [i])
    # Define parameters for function
    ylim <- 12.5
    }

  # Run function
  fname <- paste0(plots_dir, "/", "TMB-genomic", "-", cancer_groups[i], ".pdf")
  print(fname)
  p <- create_stacked_barplot_cancer_group_sum(tmb_df = tmb_genomic_filter_sub, ylim = ylim, ct_id = cancer_groups[i])
  pdf(file = fname, width = 12, height = 8)
  print(p)
  dev.off()
}
[1] 1
[1] "Other cancer group"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-Other cancer group.pdf"
[1] 2
[1] "Low-grade glioma"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-Low-grade glioma.pdf"
[1] 3
[1] "High-grade glioma"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-High-grade glioma.pdf"

Number of mutations per patient_bs_id

Here, we want to explore the number of mutations (mutation_count column) per timepoint and biospecimen sample per patient case by creating barplots.


# Define parameters for function
ylim <- 260
tmb_df = tmb_genomic_filter

# Run function
fname <- paste0(plots_dir, "/", "Total-Mutations-patient_bs_id.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Total-Mutations-patient_bs_id.pdf"
p <- create_barplot_sample(tmb_df = tmb_df,
                           ylim = ylim)
pdf(file = fname, width = 25, height = 10)
print(p)
dev.off()
quartz_off_screen 
                2 

sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.4.1

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggthemes_4.2.4  lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0   dplyr_1.1.2     purrr_1.0.1     readr_2.1.4    
 [8] tidyr_1.3.0     tibble_3.2.1    ggplot2_3.4.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] pillar_1.9.0      compiler_4.2.3    bslib_0.5.0       jquerylib_0.1.4   tools_4.2.3       bit_4.0.5        
 [7] digest_0.6.33     timechange_0.2.0  jsonlite_1.8.7    evaluate_0.21     lifecycle_1.0.3   gtable_0.3.3     
[13] pkgconfig_2.0.3   rlang_1.1.1       cli_3.6.1         rstudioapi_0.15.0 parallel_4.2.3    yaml_2.3.7       
[19] xfun_0.39         fastmap_1.1.1     withr_2.5.0       knitr_1.43        generics_0.1.3    vctrs_0.6.3      
[25] sass_0.4.7        hms_1.1.3         bit64_4.0.5       rprojroot_2.0.3   tidyselect_1.2.0  glue_1.6.2       
[31] R6_2.5.1          fansi_1.0.4       vroom_1.6.3       rmarkdown_2.23    farver_2.1.1      tzdb_0.4.0       
[37] magrittr_2.0.3    scales_1.2.1      htmltools_0.5.5   colorspace_2.1-0  labeling_0.4.2    utf8_1.2.3       
[43] stringi_1.7.12    munsell_0.5.0     cachem_1.0.8      crayon_1.5.2     
---
title: "Create TMB barplots of tumors across multiple timepoints of the PBTA Cohort"
author: "Antonia Chroni <chronia@chop.edu> for D3B"
date: "2023"
output:
  html_notebook:
    toc: TRUE
    toc_float: TRUE
---

#### Tumor evolution project 

### Data used 
In this notebook, we are using the `tmb_genomic.tsv` file generated from the `01-preprocess-data.Rmd` script.

# Set up
```{r load-library}
suppressPackageStartupMessages({
  library(tidyverse)
})
```

# Directories and File Inputs/Outputs
```{r set-dir-and-file-names}
# Anchor every path at the repository root, found by searching upward for the
# directory that contains ".git", so the notebook behaves the same regardless
# of the working directory it is launched from.
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
input_dir <- file.path(analysis_dir, "input")
scratch_dir <- file.path(root_dir, "scratch")

# Inputs: TMB table and the tumor descriptor color palette
tmb_genomic_file <- file.path(scratch_dir, "tmb_genomic.tsv")
tumor_descriptor_color_palette_file <- file.path(
  root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv"
)

# Outputs: plots directory, created if it does not exist yet
plots_dir <- file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

# Source the barplot functions of this analysis and the figures theme script
source(file.path(analysis_dir, "util", "function-create-barplot-v1.R"))
source(file.path(root_dir, "figures", "scripts", "theme.R"))
```

# Read in data and process
```{r read_input_files}
# Load the full TMB table (all experimental strategies)
tmb_genomic_all <- readr::read_tsv(
  tmb_genomic_file,
  guess_max = 100000,
  show_col_types = FALSE
)

# Are there any participants with both WGS and WXS samples?
# List the experimental strategies of each participant, ";"-separated,
# and display the table.
strategies_per_participant <- tmb_genomic_all %>%
  unique() %>%
  arrange(Kids_First_Participant_ID, experimental_strategy) %>%
  group_by(Kids_First_Participant_ID) %>%
  summarise(
    experimental_strategy_sum = str_c(experimental_strategy, collapse = ";")
  )
strategies_per_participant

# Yes, there are, so keep only non-WXS rows for downstream analyses.
# Note that this drops every WXS row, not only those of participants that
# also have WGS; rows with a missing experimental_strategy are dropped too.
tmb_genomic <- tmb_genomic_all %>%
  filter(experimental_strategy != "WXS")

# Load the color palette for tumor descriptors
tumor_descriptor_color_palette <- readr::read_tsv(
  tumor_descriptor_color_palette_file,
  guess_max = 100000,
  show_col_types = FALSE
)
```

# TMB per Kids_First_Participant_ID
We will explore TMB per `Kids_First_Participant_ID` over time by creating stacked barplots.

```{r create-stacked-barplot, fig.width = 15, fig.height = 6, fig.fullwidth = TRUE}
# Inputs for the plotting function: all samples kept after the WXS filter,
# hypermutants included. `ylim` is handed to the helper as-is (presumably the
# upper bound of the y-axis -- confirm in util/function-create-barplot-v1.R).
tmb_df <- tmb_genomic
ylim <- 360

# Build the plot and write it to a 15 x 6 inch PDF in the plots directory
fname <- file.path(plots_dir, "TMB-genomic.pdf")
print(fname)
p <- create_stacked_barplot(tmb_df = tmb_df, ylim = ylim)
pdf(file = fname, width = 15, height = 6)
print(p)
dev.off()
```
Note that low TMB is defined as ≤5 mutations/Mb, intermediate TMB as >5 and ≤20 mutations/Mb, high TMB as >20 and ≤50 mutations/Mb, and very high TMB as >50 mutations/Mb.

We notice that there are samples with high TMB (hyper-mutant samples). Next, we will exclude these samples (TMB >= 20 mutations/Mb) from downstream analysis. Attention is needed in cases with a high number of mutations at only one timepoint, as excluding that sample leaves un-matched longitudinal samples. We will also remove those so that we always have matched longitudinal samples.

```{r create-stacked-barplot-filter, fig.width = 15, fig.height = 8, fig.fullwidth = TRUE}
# Step 1: drop hypermutant samples (TMB >= 20; rows with a missing TMB are
# dropped by the same filter) and rebuild, per participant, the ";"-separated
# list of timepoints that remain. Participants left with a single timepoint
# are no longer longitudinal and are removed.
remaining_timepoints <- tmb_genomic %>%
  filter(tmb < 20) %>%
  unique() %>%
  arrange(Kids_First_Participant_ID, tumor_descriptor) %>%
  group_by(Kids_First_Participant_ID) %>%
  summarise(tumor_descriptor_sum = str_c(tumor_descriptor, collapse = ";")) %>%
  filter(!tumor_descriptor_sum %in% c("Diagnosis", "Progressive", "Recurrence"))

# Step 2: bring the sample-level rows back. The join also matches on
# `tumor_descriptor_sum`, so a participant whose rebuilt timepoint list
# differs from the one stored in `tmb_genomic` finds no match; the resulting
# all-NA row is discarded by the final filter on `tmb`.
tmb_genomic_filter <- remaining_timepoints %>%
  left_join(
    tmb_genomic,
    by = c("Kids_First_Participant_ID", "tumor_descriptor_sum")
  ) %>%
  mutate(
    # Collapse histologies into the two glioma groups plus a catch-all;
    # a missing histology is labelled "Other".
    cancer_group_sum = case_when(
      is.na(short_histology) ~ "Other",
      short_histology == "HGAT" ~ "High-grade glioma",
      short_histology == "LGAT" ~ "Low-grade glioma",
      TRUE ~ "Other cancer group"
    ),
    # Identifiers used to label patients and individual biospecimens
    patient_id = paste(short_histology, Kids_First_Participant_ID, sep = "_"),
    patient_bs_id = paste(
      Kids_First_Participant_ID, Kids_First_Biospecimen_ID,
      sep = "_"
    )
  ) %>%
  filter(!is.na(tmb))

# Inputs for the plotting function
tmb_df <- tmb_genomic_filter
ylim <- 12.5

# Build the plot and write it to a 15 x 8 inch PDF in the plots directory
fname <- file.path(plots_dir, "TMB-genomic-no-hypermutants.pdf")
print(fname)
p <- create_stacked_barplot(tmb_df = tmb_df, ylim = ylim)
pdf(file = fname, width = 15, height = 8)
print(p)
dev.off()
```

# TMB per patient_id and cancer_group_sum
We will explore TMB per cancer group over time by creating stacked barplots. We will plot based on cancer groups presenting with the highest number of samples (High- and Low-grade gliomas) versus any other cancer groups.

```{r create-stacked-barplot-filter-cancer-group-sum, fig.width = 10, fig.height = 6, fig.align='center', fig.show='hold'}
# `ylim` value handed to the plotting function for each cancer group.
# The lookup is keyed by group NAME, not by loop position: unique() returns
# the groups in order of first appearance in the data, so a positional choice
# would silently pair a limit with the wrong group whenever the row order of
# the input changes.
ylim_by_cancer_group <- c(
  "Other cancer group" = 8,
  "Low-grade glioma" = 4.5,
  "High-grade glioma" = 12.5
)
# Used for any group without an entry above
ylim_default <- 12.5

cancer_groups <- unique(as.character(tmb_genomic_filter$cancer_group_sum))
print(cancer_groups)

# One PDF per cancer group
for (i in seq_along(cancer_groups)) {
  cancer_group <- cancer_groups[i]
  print(i)

  # Samples belonging to the current cancer group only
  tmb_genomic_filter_sub <- tmb_genomic_filter %>%
    filter(cancer_group_sum == cancer_group)

  print(cancer_group)

  # Define parameters for function
  if (cancer_group %in% names(ylim_by_cancer_group)) {
    ylim <- ylim_by_cancer_group[[cancer_group]]
  } else {
    ylim <- ylim_default
  }

  # Run function
  fname <- paste0(plots_dir, "/", "TMB-genomic", "-", cancer_group, ".pdf")
  print(fname)
  p <- create_stacked_barplot_cancer_group_sum(
    tmb_df = tmb_genomic_filter_sub,
    ylim = ylim,
    ct_id = cancer_group
  )
  pdf(file = fname, width = 12, height = 8)
  print(p)
  dev.off()
}
```

# Number of mutations per patient_bs_id
Here, we want to explore the number of mutations (`mutation_count` column) per timepoint and biospecimen sample per patient case by creating barplots.

```{r create-barplot-sample, fig.width = 25, fig.height = 10, fig.fullwidth = TRUE, fig.show='hold', fig.align='center', out.width  =  "50%"}

# Inputs for the plotting function: the hypermutant-free, matched
# longitudinal samples
tmb_df <- tmb_genomic_filter
ylim <- 260

# Build the plot and write it to a 25 x 10 inch PDF in the plots directory
fname <- file.path(plots_dir, "Total-Mutations-patient_bs_id.pdf")
print(fname)
p <- create_barplot_sample(tmb_df = tmb_df, ylim = ylim)
pdf(file = fname, width = 25, height = 10)
print(p)
dev.off()
```

```{r echo=TRUE}
sessionInfo()
```
